Prerequisites (Conda py3.8 Environment)

Follow the article, “How to Run Python’s Scikit-Learn in R in 5 minutes”

Setup R’s Python interface

# Load the R <-> Python interoperability layer
library(reticulate)

# Replace this with your conda environment containing sklearn, pandas, & numpy
# required = TRUE errors immediately if the environment is not found
use_condaenv("py3.8", required = TRUE)

Data Wrangling & Preprocessing (R)

# tidyverse: dplyr/ggplot2/readr/forcats etc. for wrangling & plotting
library(tidyverse)
# lubridate: date parsing (mdy) and durations (dyears)
library(lubridate)

Data Ingest

# Read the HR dataset; readr guesses the column types (chr/dbl) shown below
hr_data_tbl <- read_csv("data/HRDataset_v13.csv")

# Print the tibble preview
hr_data_tbl
## # A tibble: 401 x 35
##    Employee_Name  EmpID MarriedID MaritalStatusID GenderID EmpStatusID DeptID
##    <chr>          <dbl>     <dbl>           <dbl>    <dbl>       <dbl>  <dbl>
##  1 Brown, Mia    1.10e9         1               1        0           1      1
##  2 LaRotonda, W… 1.11e9         0               2        1           1      1
##  3 Steans, Tyro… 1.30e9         0               0        1           1      1
##  4 Howard, Este… 1.21e9         1               1        0           1      1
##  5 Singh, Nan    1.31e9         0               0        0           1      1
##  6 Smith, Leigh… 7.11e8         1               1        0           5      1
##  7 Bunbury, Jes… 1.50e9         1               1        0           5      6
##  8 Carter, Mich… 1.40e9         0               0        0           1      6
##  9 Dietrich, Je… 1.41e9         0               0        0           1      6
## 10 Digitale, Al… 1.31e9         1               1        1           1      6
## # … with 391 more rows, and 28 more variables: PerfScoreID <dbl>,
## #   FromDiversityJobFairID <dbl>, PayRate <dbl>, Termd <dbl>, PositionID <dbl>,
## #   Position <chr>, State <chr>, Zip <chr>, DOB <chr>, Sex <chr>,
## #   MaritalDesc <chr>, CitizenDesc <chr>, HispanicLatino <chr>, RaceDesc <chr>,
## #   DateofHire <chr>, DateofTermination <chr>, TermReason <chr>,
## #   EmploymentStatus <chr>, Department <chr>, ManagerName <chr>,
## #   ManagerID <dbl>, RecruitmentSource <chr>, PerformanceScore <chr>,
## #   EngagementSurvey <dbl>, EmpSatisfaction <dbl>, SpecialProjectsCount <dbl>,
## #   LastPerformanceReview_Date <chr>, DaysLateLast30 <dbl>
hr_data_tbl %>% glimpse()

Age and Tenure Features

# Engineer relative numeric features from date strings (m/d/y format):
# years elapsed since the earliest date observed in each column.
# NOTE(review): mdy() parses 2-digit years; some DOBs appear to land in the
# wrong century (AgeRel jumps from ~19 to ~95 in the output below) --
# verify the source data / add an explicit century fix if ages matter.
hr_data_tbl <- hr_data_tbl %>%
    mutate(AgeRel    = ( mdy(DOB) - min(mdy(DOB), na.rm = TRUE) ) / dyears(1)) %>%
    mutate(TenureRel = ( mdy(DateofHire) - min(mdy(DateofHire), na.rm = TRUE) ) / dyears(1))

# Inspect the two engineered features
hr_data_tbl %>% select(AgeRel, TenureRel)
## # A tibble: 401 x 2
##    AgeRel TenureRel
##     <dbl>     <dbl>
##  1   18.8      2.80
##  2   15.2      7.99
##  3   17.6      8.72
##  4   16.6      9.10
##  5   19.3      9.31
##  6   18.3      5.71
##  7   95.3      5.60
##  8   94.3      8.61
##  9   18.3      6.11
## 10   19.6      8.61
## # … with 391 more rows
# Distribution of the relative-age feature (note the suspicious ~95 tail)
hr_data_tbl %>%
    ggplot(aes(AgeRel)) +
    geom_histogram()

Subset & Fix Missingness

# DataExplorer: quick missing-value profile per column
library(DataExplorer)
hr_data_tbl %>% plot_missing()

# Columns kept for clustering.
selections <- c(
    "Employee_Name", 
    "Sex", "MaritalDesc", 
    "PayRate", "Department",
    "AgeRel"
    )

# Subset to the selected columns, then drop rows with any missing values
# (401 -> 310 rows). all_of() supersedes one_of() (tidyselect >= 1.0) and
# errors loudly if a selection name is absent instead of silently skipping.
hr_subset_tbl <- hr_data_tbl %>%
    select(all_of(selections)) %>%
    drop_na()

hr_subset_tbl %>% glimpse()
## Rows: 310
## Columns: 6
## $ Employee_Name <chr> "Brown, Mia", "LaRotonda, William", "Steans, Tyrone", "…
## $ Sex           <chr> "F", "M", "M", "F", "F", "F", "F", "F", "F", "M", "M", …
## $ MaritalDesc   <chr> "Married", "Divorced", "Single", "Married", "Single", "…
## $ PayRate       <dbl> 28.50, 23.00, 29.00, 21.50, 16.56, 20.50, 55.00, 55.00,…
## $ Department    <chr> "Admin Offices", "Admin Offices", "Admin Offices", "Adm…
## $ AgeRel        <dbl> 18.78713210, 15.20876112, 17.55783710, 16.59958932, 19.…
hr_subset_tbl %>% plot_missing()

Preprocessing (recipes)

library(recipes)

# Preprocessing pipeline: drop the identifier column, center & scale the
# numeric features, one-hot encode the categoricals (first level dropped).
# prep() estimates each step's parameters on hr_subset_tbl.
rec_obj <- recipe(~ ., hr_subset_tbl) %>%
    step_rm(Employee_Name) %>%
    step_normalize(all_numeric()) %>%
    step_dummy(all_nominal()) %>%
    prep()

rec_obj
## Data Recipe
## 
## Inputs:
## 
##       role #variables
##  predictor          6
## 
## Training data contained 310 data points and no missing data.
## 
## Operations:
## 
## Variables removed Employee_Name [trained]
## Centering and scaling for PayRate, AgeRel [trained]
## Dummy variables from Sex, MaritalDesc, Department [trained]
# Extract the preprocessed training data from the prepped recipe.
# NOTE(review): juice() is superseded by bake(rec_obj, new_data = NULL)
# in newer versions of recipes -- same result here.
hr_subset_processed_tbl <- juice(rec_obj)

hr_subset_processed_tbl %>% glimpse()
## Rows: 310
## Columns: 12
## $ PayRate                         <dbl> -0.1810242, -0.5385474, -0.1485221, -…
## $ AgeRel                          <dbl> -0.16653976, -0.29271412, -0.20988505…
## $ Sex_M                           <dbl> 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1…
## $ MaritalDesc_Married             <dbl> 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1…
## $ MaritalDesc_Separated           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ MaritalDesc_Single              <dbl> 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0…
## $ MaritalDesc_Widowed             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0…
## $ Department_Executive.Office     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ Department_IT.IS                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ Department_Production           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ Department_Sales                <dbl> 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1…
## $ Department_Software.Engineering <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
# AgeRel after normalization: same shape as before, now centered at 0
hr_subset_processed_tbl %>%
    ggplot(aes(AgeRel)) +
    geom_histogram()

# Prep for Python
X              <- as.matrix(hr_subset_processed_tbl)  # all-numeric feature matrix
employee_names <- hr_subset_tbl$Employee_Name         # row labels, re-joined later

Machine Learning (Python)

Clustering - Python

# Data Manipulation
# (Python chunk: pandas/numpy come from the conda env selected above)
import pandas as pd
import numpy as np

R to Python

r.X
## array([[-0.18102419, -0.16653976,  0.        , ...,  0.        ,
##          0.        ,  0.        ],
##        [-0.53854743, -0.29271412,  1.        , ...,  0.        ,
##          0.        ,  0.        ],
##        [-0.14852207, -0.20988505,  1.        , ...,  0.        ,
##          0.        ,  0.        ],
##        ...,
##        [ 1.46358272, -0.33538365,  1.        , ...,  0.        ,
##          0.        ,  0.        ],
##        [ 1.41157934,  2.57647355,  1.        , ...,  0.        ,
##          0.        ,  0.        ],
##        [ 1.55458864, -0.50374485,  0.        , ...,  0.        ,
##          0.        ,  0.        ]])
pd.Series(r.employee_names)
## 0                Brown, Mia
## 1        LaRotonda, William
## 2            Steans, Tyrone
## 3           Howard, Estelle
## 4                Singh, Nan
##                ...         
## 305            Daniele, Ann
## 306          Lajiri,  Jyoti
## 307    Semizoglou, Jeremiah
## 308              South, Joe
## 309         Warfield, Sarah
## Length: 310, dtype: object

Affinity Propagation

# Machine Learning
from sklearn.cluster import AffinityPropagation

# Fit Affinity Propagation on the preprocessed feature matrix from R.
# NOTE(review): no random_state is set; label numbering may vary across
# sklearn versions/runs -- confirm if reproducibility matters.
af = AffinityPropagation().fit(r.X)
af
## AffinityPropagation(affinity='euclidean', convergence_iter=15, copy=True,
##                     damping=0.5, max_iter=200, preference=None, verbose=False)
# Row indices of the exemplar points (one per discovered cluster)
af.cluster_centers_indices_
## array([  8,  15,  22,  27,  39,  42,  43,  48,  54,  82,  84,  90, 101,
##        106, 110, 144, 166, 176, 184, 249, 286, 289, 306])
# Per-employee cluster labels
cluster_assignments_af = af.labels_
cluster_assignments_af
## array([13, 15, 16, 13, 11, 13,  8,  2,  0,  3,  1,  0,  8,  1,  3,  1,  1,
##         3,  3,  1,  3,  0,  2,  0,  3,  1,  1,  3,  0,  0,  3,  2,  2,  7,
##        20,  5,  5, 22,  4,  4,  6,  4,  5,  6,  7,  5,  5,  7,  7,  5,  5,
##         5,  6,  9,  8, 20, 20, 18, 22, 20,  6, 12,  6,  7,  5,  5,  7,  7,
##        20,  7,  7,  5, 22,  8,  2,  9,  9,  9,  9, 10, 10, 10,  9, 10, 10,
##         9, 10, 19, 11, 16, 11, 19, 14, 11, 13, 13, 15, 14, 11, 14, 13, 12,
##        19, 11, 13, 14, 13, 13, 15, 11, 14, 13, 18, 14, 15, 19, 18, 11, 13,
##        13, 16, 11, 16, 14, 11, 14, 13, 11, 16, 17, 17, 11, 17, 13, 11, 11,
##        13, 11, 17, 14, 13, 13, 17, 13, 15, 13, 12, 11, 16, 16, 13, 16, 14,
##        11, 17, 18, 16, 12, 14, 12, 12, 14, 13, 15, 18, 11, 16, 11, 11, 16,
##        16, 11, 14, 13, 13, 12, 17, 11, 14, 17, 16, 13, 16, 13, 18, 11, 13,
##        11, 17, 17, 16, 17, 15, 16, 15, 12, 11, 16, 13, 13, 19, 16, 13, 11,
##        16, 15, 16, 14, 11, 17, 16, 16, 14, 19, 14, 12, 13, 11, 16, 16, 13,
##        11, 13, 16, 11, 16, 17, 13, 13, 11, 13, 19, 11, 16, 13, 12, 14, 13,
##        17, 14, 17, 11, 12, 13, 11, 11, 11, 12, 19, 19, 18, 16, 11, 14, 16,
##        14, 14, 13, 17, 11, 13, 16, 13, 16, 17, 14, 11, 14, 16, 11, 17, 13,
##        17, 15, 11, 16, 11, 17, 13, 19, 15,  0,  0,  3,  4, 20, 20,  3,  3,
##        21, 21, 21, 21, 21, 21, 21, 21, 21, 12,  7,  7, 22,  4,  2,  4,  8,
##        22, 22,  2,  4])

DBSCAN

from sklearn.cluster import DBSCAN

# Density-based clustering with default eps=0.5; points not belonging to
# any dense region receive the noise label -1.
db = DBSCAN(min_samples=5).fit(r.X)
db
## DBSCAN(algorithm='auto', eps=0.5, leaf_size=30, metric='euclidean',
##        metric_params=None, min_samples=5, n_jobs=None, p=None)
# Per-employee cluster labels (-1 = noise)
cluster_assignments_db = db.labels_
cluster_assignments_db
## array([-1, -1, -1, -1, -1, -1, -1, -1,  0, -1,  1, -1, -1,  1, -1,  1,  1,
##        -1, -1,  1, -1,  0, -1,  0, -1,  1,  1, -1,  0,  0, -1, -1, -1,  2,
##        -1,  3,  3, -1, -1, -1, -1, -1,  3, -1,  2, -1,  3,  2,  2,  3, -1,
##        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  3,  2, -1,
##        -1, -1,  2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
##        -1, -1, -1,  4,  5,  4, -1,  6,  4,  7,  7, -1,  6,  4,  6,  7,  8,
##        -1,  4,  7,  6,  7,  7, 10,  4,  6,  7, -1,  6, -1, -1, -1,  4,  7,
##         7,  5,  4,  5,  6,  4,  6,  7,  4,  5, -1,  9,  4, -1,  7,  4,  4,
##         7,  4,  9,  6,  7,  7,  9,  7, 10,  7,  8,  4,  5,  5,  7,  5,  6,
##         4,  9, -1,  5,  8,  6, -1,  8,  6,  7, -1, -1,  4,  5,  4,  4,  5,
##         5,  4,  6,  7,  7,  8,  9,  4,  6, -1,  5,  7,  5,  7, -1,  4,  7,
##         4,  9,  9,  5,  9, 10,  5, 10, -1,  4,  5,  7,  7, -1,  5,  7,  4,
##         5, 10,  5,  6,  4,  9,  5,  5,  6, -1,  6,  8,  7,  4,  5,  5,  7,
##         4,  7,  5,  4,  5,  9,  7,  7,  4,  7, -1,  4,  5,  7, -1,  6,  7,
##         9,  6,  9,  4,  8,  7,  4,  4,  4, -1, -1, -1, -1,  5,  4,  6,  5,
##         6,  6,  7,  9,  4,  7,  5,  7,  5, -1,  6,  4,  6,  5,  4, -1,  7,
##        -1, -1,  4,  5,  4, -1,  7, -1, -1,  0,  0, -1, -1, -1, -1, -1, -1,
##        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
##        -1, -1, -1, -1])

TSNE Low-Dimensional Embedding - Python

t-SNE is needed to create a reduced 2-D representation of the original data, which we will use to visualize the cluster assignments.

from sklearn.manifold import TSNE

# Project the feature matrix to 2-D for plotting; random_state pins the layout
X_embedded = TSNE(n_components=2, random_state=123).fit_transform(r.X)

pd.DataFrame(X_embedded)
##              0          1
## 0     2.663728  10.084321
## 1     6.798886 -10.097016
## 2     5.099703 -14.169149
## 3     3.235934   9.874836
## 4    27.622253  -7.657422
## ..         ...        ...
## 305 -20.879011  -0.229556
## 306  -4.612731  13.428928
## 307  -4.403336  13.283936
## 308 -22.674707  -0.892780
## 309  -8.842725  11.582272
## 
## [310 rows x 2 columns]

Py to R

Getting Scikit-Learn Results in RMarkdown

# Affinity Propagation cluster labels, pulled back into R via the py$ proxy
py$cluster_assignments_af
##   [1] 13 15 16 13 11 13  8  2  0  3  1  0  8  1  3  1  1  3  3  1  3  0  2  0  3
##  [26]  1  1  3  0  0  3  2  2  7 20  5  5 22  4  4  6  4  5  6  7  5  5  7  7  5
##  [51]  5  5  6  9  8 20 20 18 22 20  6 12  6  7  5  5  7  7 20  7  7  5 22  8  2
##  [76]  9  9  9  9 10 10 10  9 10 10  9 10 19 11 16 11 19 14 11 13 13 15 14 11 14
## [101] 13 12 19 11 13 14 13 13 15 11 14 13 18 14 15 19 18 11 13 13 16 11 16 14 11
## [126] 14 13 11 16 17 17 11 17 13 11 11 13 11 17 14 13 13 17 13 15 13 12 11 16 16
## [151] 13 16 14 11 17 18 16 12 14 12 12 14 13 15 18 11 16 11 11 16 16 11 14 13 13
## [176] 12 17 11 14 17 16 13 16 13 18 11 13 11 17 17 16 17 15 16 15 12 11 16 13 13
## [201] 19 16 13 11 16 15 16 14 11 17 16 16 14 19 14 12 13 11 16 16 13 11 13 16 11
## [226] 16 17 13 13 11 13 19 11 16 13 12 14 13 17 14 17 11 12 13 11 11 11 12 19 19
## [251] 18 16 11 14 16 14 14 13 17 11 13 16 13 16 17 14 11 14 16 11 17 13 17 15 11
## [276] 16 11 17 13 19 15  0  0  3  4 20 20  3  3 21 21 21 21 21 21 21 21 21 12  7
## [301]  7 22  4  2  4  8 22 22  2  4
# DBSCAN cluster labels (-1 = noise), pulled back into R via py$
py$cluster_assignments_db
##   [1] -1 -1 -1 -1 -1 -1 -1 -1  0 -1  1 -1 -1  1 -1  1  1 -1 -1  1 -1  0 -1  0 -1
##  [26]  1  1 -1  0  0 -1 -1 -1  2 -1  3  3 -1 -1 -1 -1 -1  3 -1  2 -1  3  2  2  3
##  [51] -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  3  2 -1 -1 -1  2 -1 -1 -1 -1
##  [76] -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  4  5  4 -1  6  4  7  7 -1  6  4  6
## [101]  7  8 -1  4  7  6  7  7 10  4  6  7 -1  6 -1 -1 -1  4  7  7  5  4  5  6  4
## [126]  6  7  4  5 -1  9  4 -1  7  4  4  7  4  9  6  7  7  9  7 10  7  8  4  5  5
## [151]  7  5  6  4  9 -1  5  8  6 -1  8  6  7 -1 -1  4  5  4  4  5  5  4  6  7  7
## [176]  8  9  4  6 -1  5  7  5  7 -1  4  7  4  9  9  5  9 10  5 10 -1  4  5  7  7
## [201] -1  5  7  4  5 10  5  6  4  9  5  5  6 -1  6  8  7  4  5  5  7  4  7  5  4
## [226]  5  9  7  7  4  7 -1  4  5  7 -1  6  7  9  6  9  4  8  7  4  4  4 -1 -1 -1
## [251] -1  5  4  6  5  6  6  7  9  4  7  5  7  5 -1  6  4  6  5  4 -1  7 -1 -1  4
## [276]  5  4 -1  7 -1 -1  0  0 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
## [301] -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
# Convert the t-SNE matrix (Python -> R, no column names) into a tibble.
# tibble >= 3.0 deprecates the implicit V1/V2 naming of unnamed matrix
# columns, so set the names explicitly -- downstream code relies on V1/V2.
X_embedded_tbl <- py$X_embedded %>%
    as_tibble(.name_repair = ~ c("V1", "V2"))
X_embedded_tbl
## # A tibble: 310 x 2
##        V1       V2
##     <dbl>    <dbl>
##  1   2.66  10.1   
##  2   6.80 -10.1   
##  3   5.10 -14.2   
##  4   3.24   9.87  
##  5  27.6   -7.66  
##  6   3.28   9.86  
##  7 -21.3   -0.0448
##  8 -22.2   -0.0107
##  9 -11.1   12.9   
## 10  -7.94  15.1   
## # … with 300 more rows

Visualizations (R)

library(plotly)
library(tidyquant)

Data Preparation

# Combine the employee names with both sets of cluster labels and the
# 2-D t-SNE coordinates, then attach the full HR attributes per employee.
employee_clustering_tbl <- tibble(
    Employee_Name = employee_names,
    cluster_af    = py$cluster_assignments_af,
    cluster_db    = py$cluster_assignments_db
) %>%
    bind_cols(X_embedded_tbl) %>%
    # Explicit key (the only shared column) avoids the "Joining, by = ..."
    # message and guards against accidental multi-column joins.
    left_join(hr_data_tbl, by = "Employee_Name")

employee_clustering_tbl
## # A tibble: 310 x 41
##    Employee_Name cluster_af cluster_db     V1       V2  EmpID MarriedID
##    <chr>              <dbl>      <dbl>  <dbl>    <dbl>  <dbl>     <dbl>
##  1 Brown, Mia            13         -1   2.66  10.1    1.10e9         1
##  2 LaRotonda, W…         15         -1   6.80 -10.1    1.11e9         0
##  3 Steans, Tyro…         16         -1   5.10 -14.2    1.30e9         0
##  4 Howard, Este…         13         -1   3.24   9.87   1.21e9         1
##  5 Singh, Nan            11         -1  27.6   -7.66   1.31e9         0
##  6 Smith, Leigh…         13         -1   3.28   9.86   7.11e8         1
##  7 Bunbury, Jes…          8         -1 -21.3   -0.0448 1.50e9         1
##  8 Carter, Mich…          2         -1 -22.2   -0.0107 1.40e9         0
##  9 Dietrich, Je…          0          0 -11.1   12.9    1.41e9         0
## 10 Digitale, Al…          3         -1  -7.94  15.1    1.31e9         1
## # … with 300 more rows, and 34 more variables: MaritalStatusID <dbl>,
## #   GenderID <dbl>, EmpStatusID <dbl>, DeptID <dbl>, PerfScoreID <dbl>,
## #   FromDiversityJobFairID <dbl>, PayRate <dbl>, Termd <dbl>, PositionID <dbl>,
## #   Position <chr>, State <chr>, Zip <chr>, DOB <chr>, Sex <chr>,
## #   MaritalDesc <chr>, CitizenDesc <chr>, HispanicLatino <chr>, RaceDesc <chr>,
## #   DateofHire <chr>, DateofTermination <chr>, TermReason <chr>,
## #   EmploymentStatus <chr>, Department <chr>, ManagerName <chr>,
## #   ManagerID <dbl>, RecruitmentSource <chr>, PerformanceScore <chr>,
## #   EngagementSurvey <dbl>, EmpSatisfaction <dbl>, SpecialProjectsCount <dbl>,
## #   LastPerformanceReview_Date <chr>, DaysLateLast30 <dbl>, AgeRel <dbl>,
## #   TenureRel <dbl>
# Termination (attrition) rate per DBSCAN cluster, highest first.
# NOTE: term_count is the cluster SIZE (n()), not a count of terminations;
# the name is kept because downstream plots reference it.
attrition_rate_tbl <- employee_clustering_tbl %>%
    select(cluster_db, Termd) %>%
    group_by(cluster_db) %>%
    summarise(
        term_rate  = mean(Termd),  # idiomatic form of sum(Termd) / length(Termd)
        term_count = n()
    ) %>%
    arrange(desc(term_rate))

attrition_rate_tbl
## # A tibble: 12 x 3
##    cluster_db term_rate term_count
##         <dbl>     <dbl>      <int>
##  1         10     0.8            5
##  2          9     0.769         13
##  3          8     0.571          7
##  4          7     0.45          40
##  5          6     0.375         24
##  6          3     0.333          6
##  7          5     0.312         32
##  8          4     0.293         41
##  9         -1     0.262        122
## 10          0     0.143          7
## 11          1     0.143          7
## 12          2     0              6

Attrition by Cluster Visualization

# Horizontal bar chart: one bar per cluster, ordered by size, filled by
# attrition rate.
# NOTE(review): the x axis plots term_count, which is the cluster size
# (n() above), yet the label reads "Attrition Count" -- verify intent.
g <- attrition_rate_tbl %>%
    mutate(cluster_db = as_factor(cluster_db) %>% fct_reorder(term_count)) %>%
    ggplot(aes(term_count, cluster_db)) +
    geom_col(aes(fill = term_rate)) +
    theme_tq() +
    labs(title = "Attrition Rate by Employee Cluster",
         fill = "Attr. Rate", x = "Attrition Count", y = "Cluster Assignment")

# Interactive version of the same chart
ggplotly(g)

Cluster Network Visualization - R

# Attach the per-cluster attrition stats (explicit key avoids the
# "Joining, by = ..." guess) and build a hover-text description per employee.
data_formatted <- employee_clustering_tbl %>%
    left_join(attrition_rate_tbl, by = "cluster_db") %>%
    mutate(description = str_glue("{Employee_Name}
                                  Position = {Position}
                                  MaritalDesc = {MaritalDesc}
                                  Sex = {Sex}
                                  Race = {RaceDesc}
                                  EmpStatusID = {EmpStatusID}
                                  PayRate = {PayRate}
                                  Terminated = {Termd}
                                  Term Reason = {TermReason}
                                  
                                  Cluster Term Rate: {scales::percent(term_rate)}
                                  Cluster Term Count: {term_count}
                                  
                                  ")
    ) %>%
    # Keep identifier, coordinates, tooltip, and attrition fields for plotting
    select(Employee_Name:V2, description, Termd, 
           term_rate, term_count)

# Scatter of employees in t-SNE space, colored by DBSCAN cluster and
# sized by the cluster's attrition rate.
# NOTE: `text` is not a real ggplot2 aesthetic (ggplot2 warns about it),
# but ggplotly() picks it up for the hover tooltips.
g <- data_formatted %>%
    
    ggplot(aes(V1, V2, color = factor(cluster_db))) +
    geom_point(aes(text = description, size = term_rate), alpha = 0.5) +
    scale_color_tq() +
    theme_tq() +
    # theme(legend.position = "none") + 
    labs(title = "Employee Cluster Assignments", color = "Cluster")
    

ggplotly(g)

Shiny App

Sourcing Python

# Bonus #1!!!
# Load standalone Python files; their top-level defs become callable R functions
source_python("py/clustering.py")
source_python("py/tsne.py")
# Calls def cluster_dbscan() 
cluster_dbscan(X)
##   [1] -1 -1 -1 -1 -1 -1 -1 -1  0 -1  1 -1 -1  1 -1  1  1 -1 -1  1 -1  0 -1  0 -1
##  [26]  1  1 -1  0  0 -1 -1 -1  2 -1  3  3 -1 -1 -1 -1 -1  3 -1  2 -1  3  2  2  3
##  [51] -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  3  2 -1 -1 -1  2 -1 -1 -1 -1
##  [76] -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  4  5  4 -1  6  4  7  7 -1  6  4  6
## [101]  7  8 -1  4  7  6  7  7 10  4  6  7 -1  6 -1 -1 -1  4  7  7  5  4  5  6  4
## [126]  6  7  4  5 -1  9  4 -1  7  4  4  7  4  9  6  7  7  9  7 10  7  8  4  5  5
## [151]  7  5  6  4  9 -1  5  8  6 -1  8  6  7 -1 -1  4  5  4  4  5  5  4  6  7  7
## [176]  8  9  4  6 -1  5  7  5  7 -1  4  7  4  9  9  5  9 10  5 10 -1  4  5  7  7
## [201] -1  5  7  4  5 10  5  6  4  9  5  5  6 -1  6  8  7  4  5  5  7  4  7  5  4
## [226]  5  9  7  7  4  7 -1  4  5  7 -1  6  7  9  6  9  4  8  7  4  4  4 -1 -1 -1
## [251] -1  5  4  6  5  6  6  7  9  4  7  5  7  5 -1  6  4  6  5  4 -1  7 -1 -1  4
## [276]  5  4 -1  7 -1 -1  0  0 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
## [301] -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
# calls def tsne_embedding(); returns a 310 x 2 matrix, preview first rows
tsne_embedding(X) %>% head()
##           [,1]       [,2]
## [1,]  2.663728  10.084321
## [2,]  6.798886 -10.097016
## [3,]  5.099703 -14.169149
## [4,]  3.235934   9.874836
## [5,] 27.622253  -7.657422
## [6,]  3.280085   9.856876

Integrate into Shiny App!

# Bonus #2!!!
# Embed a screenshot of the resulting Shiny app in the rendered document
knitr::include_graphics("img/shiny_app.jpg")